# -*- coding: utf-8 -*-


import jieba   #分词
#jieba.load_userdict("D:/vocabulary1.txt")
from jieba import posseg
import codecs



# 预置词库(将库里的词汇表存入txt)
def Get_Vocabulary(databaseSimulation, filepath):
    """Dump the keyword vocabulary to a text file and return it as a list.

    Calls ``databaseSimulation.Find('rec_article', 'tag_keyword')`` and, for
    every returned record, takes its ``'name'`` entry as a vocabulary word.

    Args:
        databaseSimulation: Object exposing ``Find(arg1, arg2)`` that returns
            an iterable of mappings, each having a ``'name'`` key.
        filepath: Path of the output file. It is created or truncated and
            written as UTF-8, one word per line terminated by ``'\\r\\n'``.

    Returns:
        List of the words, in the order ``Find`` yielded them.
    """
    tag_keyword = databaseSimulation.Find('rec_article', 'tag_keyword')
    vocabulary = []
    # The context manager guarantees the file is flushed and closed even if a
    # record is malformed; the previous code referenced ``f.close`` without
    # calling it, so the handle was never explicitly closed.
    with codecs.open(filepath, 'w', 'utf-8') as f:
        for tag in tag_keyword:
            word = tag['name']
            vocabulary.append(word)
            f.write(word + '\r\n')

    return vocabulary


#切词
def WordSegment_Jieba(text):
    """Segment *text* with jieba's part-of-speech tagger and filter tokens.

    A token is kept only when its word is at least two characters long and
    its part-of-speech flag is one of the whitelisted flags below.

    Args:
        text: The string passed to ``posseg.cut``.

    Returns:
        A list of ``[word, flag]`` pairs, in the order the tagger produced
        them.
    """
    # Part-of-speech flags that are retained.
    kept_flags = ('n', 'nz', 'nt', 'nr', 'ns', 'v', 'vn', 'j', 'x')
    return [
        [token.word, token.flag]
        for token in posseg.cut(text)
        if len(token.word) >= 2 and token.flag in kept_flags
    ]

#text = "开放式基金下证券投资基金的国债逆回购"
#segments = WordSegment_Jieba(text)
#print(segments)